This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
Attaching package: ‘tidyr’
The following object is masked _by_ ‘.GlobalEnv’:
billboard
library(car)
Warning: package ‘car’ was built under R version 4.3.2Loading required package: carData
Warning: package ‘carData’ was built under R version 4.3.2
Attaching package: ‘car’
The following object is masked from ‘package:dplyr’:
recode
library(MASS)
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
The following object is masked from ‘package:plotly’:
select
library(repr)
Warning: package ‘repr’ was built under R version 4.3.2
library(pals)
Warning: package ‘pals’ was built under R version 4.3.2
library(ggpubr)
Warning: package ‘ggpubr’ was built under R version 4.3.2
library(plotly)
# Load the EV catalogue and give the two awkwardly named CSV columns short names.
ecars_raw = read.csv('EV_cars.csv')
ecars_raw = ecars_raw %>%
  rename(Price = Price.DE., Acceleration = acceleration..0.100.)

# Make = first space-separated token of Car_name.
# vapply() replaces the former `for (i in 1:n)` loop, which grew the vector one
# element at a time and failed on an empty table (1:0 counts down to 0).
make = strsplit(ecars_raw$Car_name, split = ' ')
make_ = vapply(make, function(parts) parts[1], character(1))
ecars_raw$Make = make_

# Column order: Make directly before Car_name_link, Battery directly after it.
ecars_raw = ecars_raw %>%
  relocate(Make, .before = Car_name_link) %>%
  relocate(Battery, .after = Car_name_link)

# Rows without a Fast_charge value are excluded from everything that follows.
ecars_raw = ecars_raw %>% filter(!is.na(Fast_charge))

# Split on price availability: `ecars` holds the rows with a known Price,
# `ecars_missing_price` the rows whose Price is NA.
ecars = ecars_raw %>% filter(!is.na(Price))
ecars_missing_price = ecars_raw %>% filter(is.na(Price))
ecars
ecars_missing_price
# Scatterplot matrix of columns 4 to 10 of ecars (title typo "Quantitive" fixed).
# NOTE(review): the positional range presumably covers Battery through
# Acceleration after the relocate() calls; confirm against the CSV layout.
plot(ecars[, 4:10],
     main = 'Comparison of all Quantitative Features')
# Rows of ecars whose Make appears at least 10 times.
# Despite the name this is a frequency filter, not a "top 10" ranking.
top_10 = ecars %>%
  group_by(Make) %>%
  filter(n() >= 10) %>%
  ungroup()  # drop the grouping so later verbs are not silently applied per make
top_10
# Palette with 14 colours, one per highlighted make.
make_colors = c(
  '#e6194b', '#f58231', '#ffe119', '#bcf60c', '#3cb44b', '#008080', '#aaffc3',
  '#4363d8', '#000075', '#46f0f0', '#911eb4', '#e6beff', '#f032e6', '#fabebe'
)
# Same palette plus black as a 15th entry for the catch-all group.
make_colors2 = c(make_colors, 'black')

# NOTE(review): no visible line creates ecars$Make2, so on a fresh run the
# labels below became NA and the plots that colour by Make2 failed. The column
# is rebuilt here only when it is absent. The rule (keep Make for makes with
# 10+ models, one catch-all value otherwise) is inferred from the plot titles
# and the `n() >= 10` filters used elsewhere; confirm against the original
# definition. The catch-all value is chosen to sort after every make name so
# that it lines up with the final (black) palette entry.
if (is.null(ecars[['Make2']])) {
  models_per_make = table(ecars$Make)
  highlighted = names(models_per_make)[models_per_make >= 10]
  ecars$Make2 = ifelse(ecars$Make %in% highlighted, ecars$Make, 'zz_Other')
}

# Legend labels: the sorted Make2 values, with position 15 shown as 'Other'.
# NOTE(review): the fixed index assumes exactly 14 highlighted makes.
make_other = unique(ecars$Make2)
make_other = sort(make_other)
make_other[15] = 'Other'
make_other
[1] "Audi" "BMW" "Citroen" "Fiat"
[5] "Hyundai" "Mercedes" "MG" "NIO"
[9] "Opel" "Peugeot" "Porsche" "Tesla"
[13] "Volkswagen" "Volvo" "Other"
# Battery capacity against price, one point per model, coloured by Make2 with
# the 15-entry palette and the matching legend labels.
# NOTE(review): `text` is not an aesthetic that geom_point() draws; presumably
# it was kept for plotly tooltips. TODO confirm.
ggplot(data = ecars, mapping = aes(x = Battery, y = Price, text = Car_name)) +
  geom_point(mapping = aes(colour = Make2)) +
  scale_color_manual(name = "Make", values = make_colors2, labels = make_other) +
  labs(
    x = "Battery Capacity (kWh)",
    y = "Price in Germany (euros) ",
    title = 'Electric Vehicle Battery vs. Price (Makes with 10+ Models Highlighted)'
  ) +
  theme(legend.position = "bottom")
# Simple linear regression of Range on Battery, followed by its summary table.
Range_Bat = lm(formula = Range ~ Battery, data = ecars)
summary(object = Range_Bat)
Call:
lm(formula = Range ~ Battery, data = ecars)
Residuals:
Min 1Q Median 3Q Max
-152.015 -27.740 6.636 34.682 123.700
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 39.1986 10.8427 3.615 0.000351 ***
Battery 4.6424 0.1461 31.780 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 52.04 on 305 degrees of freedom
Multiple R-squared: 0.7681, Adjusted R-squared: 0.7673
F-statistic: 1010 on 1 and 305 DF, p-value: < 2.2e-16
# Diagnostic plots for the Range ~ Battery fit.
plot(Range_Bat)

# The two ends of the model search: intercept only, and all six candidate
# predictors of Price.
price_model_empty = lm(formula = Price ~ 1, data = ecars)
price_model_full = lm(
  formula = Price ~ Battery + Efficiency + Fast_charge + Range + Top_speed + Acceleration,
  data = ecars
)
summary(object = price_model_full)
Call:
lm(formula = Price ~ Battery + Efficiency + Fast_charge + Range +
Top_speed + Acceleration, data = ecars)
Residuals:
Min 1Q Median 3Q Max
-53557 -11739 -178 8223 84430
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.613e+05 2.398e+04 -6.724 8.91e-11 ***
Battery 2.215e+02 3.506e+02 0.632 0.52798
Efficiency 3.052e+02 1.140e+02 2.678 0.00781 **
Fast_charge 1.785e+01 7.819e+00 2.283 0.02315 *
Range 9.066e+00 6.679e+01 0.136 0.89211
Top_speed 7.013e+02 7.086e+01 9.897 < 2e-16 ***
Acceleration 1.762e+03 7.746e+02 2.274 0.02366 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 18530 on 300 degrees of freedom
Multiple R-squared: 0.7167, Adjusted R-squared: 0.711
F-statistic: 126.5 on 6 and 300 DF, p-value: < 2.2e-16
# Forward stepwise selection with penalty k = 2 (AIC), starting from the
# intercept-only model and allowed to grow up to the full model's formula.
scope = list(
  lower = formula(price_model_empty),
  upper = formula(price_model_full)
)
forwardAIC = step(price_model_empty, scope = scope, direction = 'forward', k = 2)
Start: AIC=6415.84
Price ~ 1
Df Sum of Sq RSS AIC
+ Top_speed 1 2.1023e+11 1.5319e+11 6152.6
+ Battery 1 1.7911e+11 1.8431e+11 6209.4
+ Fast_charge 1 1.3923e+11 2.2419e+11 6269.5
+ Range 1 1.2615e+11 2.3728e+11 6287.0
+ Acceleration 1 1.0296e+11 2.6046e+11 6315.6
+ Efficiency 1 1.1067e+10 3.5235e+11 6408.3
<none> 3.6342e+11 6415.8
Step: AIC=6152.62
Price ~ Top_speed
Df Sum of Sq RSS AIC
+ Efficiency 1 4.2802e+10 1.1039e+11 6054.0
+ Battery 1 1.9965e+10 1.3322e+11 6111.8
+ Acceleration 1 1.5065e+10 1.3812e+11 6122.8
<none> 1.5319e+11 6152.6
+ Fast_charge 1 2.2387e+08 1.5297e+11 6154.2
+ Range 1 7.0491e+07 1.5312e+11 6154.5
Step: AIC=6054.03
Price ~ Top_speed + Efficiency
Df Sum of Sq RSS AIC
+ Fast_charge 1 3519695484 1.0687e+11 6046.1
+ Range 1 3506122724 1.0688e+11 6046.1
+ Battery 1 3063673448 1.0732e+11 6047.4
+ Acceleration 1 915987463 1.0947e+11 6053.5
<none> 1.1039e+11 6054.0
Step: AIC=6046.08
Price ~ Top_speed + Efficiency + Fast_charge
Df Sum of Sq RSS AIC
+ Range 1 2137906570 1.0473e+11 6041.9
+ Battery 1 2032210165 1.0484e+11 6042.2
+ Acceleration 1 713779942 1.0615e+11 6046.0
<none> 1.0687e+11 6046.1
Step: AIC=6041.87
Price ~ Top_speed + Efficiency + Fast_charge + Range
Df Sum of Sq RSS AIC
+ Acceleration 1 1639080066 1.0309e+11 6039.0
<none> 1.0473e+11 6041.9
+ Battery 1 1076868 1.0473e+11 6043.9
Step: AIC=6039.03
Price ~ Top_speed + Efficiency + Fast_charge + Range + Acceleration
Df Sum of Sq RSS AIC
<none> 1.0309e+11 6039.0
+ Battery 1 1.37e+08 1.0295e+11 6040.6
# Price model on five predictors (every candidate except Battery), written out
# explicitly rather than reusing the step() result.
price_model_initial = lm(
  formula = Price ~ Efficiency + Fast_charge + Range + Top_speed + Acceleration,
  data = ecars
)
summary(object = price_model_initial)
Call:
lm(formula = Price ~ Efficiency + Fast_charge + Range + Top_speed +
Acceleration, data = ecars)
Residuals:
Min 1Q Median 3Q Max
-53655 -11767 -328 8350 82937
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.731e+05 1.497e+04 -11.561 < 2e-16 ***
Efficiency 3.730e+02 3.831e+01 9.736 < 2e-16 ***
Fast_charge 1.699e+01 7.692e+00 2.208 0.02797 *
Range 4.992e+01 1.669e+01 2.991 0.00301 **
Top_speed 7.047e+02 7.058e+01 9.984 < 2e-16 ***
Acceleration 1.637e+03 7.485e+02 2.188 0.02947 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 18510 on 301 degrees of freedom
Multiple R-squared: 0.7163, Adjusted R-squared: 0.7116
F-statistic: 152 on 5 and 301 DF, p-value: < 2.2e-16
# Diagnostic plots for the untransformed price model.
plot(price_model_initial)

# Box-Cox profile for the price model; `bc$x` is the lambda grid and `bc$y`
# the value profiled at each grid point.
bc = boxCox(price_model_initial)

# which.max() always yields exactly one lambda. The previous
# `bc$y == max(bc$y)` test returned every tied grid point, which would have
# made `lambda` a vector and silently recycled it in the transform below.
lambda = bc$x[which.max(bc$y)]

# Box-Cox transform of Price. The power form is 0/0 at lambda = 0, where the
# transform is defined as the natural log.
ecars$Price_lambda = if (lambda == 0) {
  log(ecars$Price)
} else {
  (ecars$Price^lambda - 1)/lambda
}

# Same five predictors as price_model_initial, now on the transformed response.
price_model = lm(Price_lambda ~ Efficiency + Fast_charge + Range + Top_speed + Acceleration, data = ecars)
summary(price_model)
Call:
lm(formula = Price_lambda ~ Efficiency + Fast_charge + Range +
Top_speed + Acceleration, data = ecars)
Residuals:
Min 1Q Median 3Q Max
-2.951e-04 -4.127e-05 8.710e-06 4.794e-05 1.467e-04
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.413e+00 5.206e-05 27130.409 < 2e-16 ***
Efficiency 3.059e-06 1.332e-07 22.957 < 2e-16 ***
Fast_charge 1.407e-07 2.675e-08 5.259 2.75e-07 ***
Range 5.510e-07 5.805e-08 9.491 < 2e-16 ***
Top_speed 1.601e-06 2.455e-07 6.524 2.89e-10 ***
Acceleration -4.715e-06 2.603e-06 -1.812 0.071 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.436e-05 on 301 degrees of freedom
Multiple R-squared: 0.853, Adjusted R-squared: 0.8505
F-statistic: 349.3 on 5 and 301 DF, p-value: < 2.2e-16
# Diagnostic plots and a one-row fit summary for the transformed-price model.
plot(price_model)
broom::glance(price_model)

# Inverse of the Box-Cox transform (z * lambda + 1)^(1/lambda), applied
# elementwise. At lambda = 0 the forward transform is log(), so the inverse
# is exp(); the power form would return 1 for every input there.
inv_box_cox = function(z, lambda) {
  if (lambda == 0) {
    exp(z)
  } else {
    ((z * lambda) + 1)^(1/lambda)
  }
}

# Fitted values with prediction and confidence intervals on the transformed
# scale. The plain column indexing below treats column 1 as the fit and
# columns 2 and 3 as the lower and upper bounds.
prediction = predict(price_model, ecars, interval = 'prediction')
confidence = predict(price_model, ecars, interval = 'confidence')

# Back on the scale of Price. NOTE(review): the `_dollars` suffix is kept for
# compatibility, but the plot labels describe Price as euros.
prediction_dollars = inv_box_cox(prediction, lambda)
confidence_dollars = inv_box_cox(confidence, lambda)

# One row per model in ecars; every price column is divided by 1000.
predicted_price = data.frame(Name = ecars$Car_name,
                             Make = ecars$Make,
                             Price = ecars$Price/1000,
                             Predicted = (prediction_dollars[,1]/1000),
                             Predict_lwr = (prediction_dollars[,2]/1000),
                             Predict_upr = (prediction_dollars[,3]/1000),
                             Confidence_lwr = (confidence_dollars[,2]/1000),
                             Confidence_upr = (confidence_dollars[,3]/1000))
predicted_price

# Mean actual and mean predicted price for each make with at least 10 models.
most_makes = predicted_price %>%
  group_by(Make) %>%
  filter(n() >= 10) %>%
  summarise(mean_price = mean(Price), mean_predicted = mean(Predicted))
most_makes

# Point predictions (in thousands) for the models that have no listed price.
prediction_missing = predict(price_model, ecars_missing_price, interval = 'prediction')
prediction_missing_dollars = inv_box_cox(prediction_missing, lambda)
predicted_missing_price = data.frame(Name = ecars_missing_price$Car_name,
                                     Make = ecars_missing_price$Make,
                                     Predicted = (prediction_missing_dollars[,1]/1000))
predicted_missing_price
All EV models, with the average for each make that has 10 or more models
# NOTE(review): scratch expression; its value is not assigned and no later line uses it.
1 + 1
[1] 2
# Base plot `a`: five smoothed curves drawn against the predicted price.
#   blue         - the predicted price itself
#   red dashed   - lower and upper prediction bounds
#   black dashed - lower and upper confidence bounds
# The plot-level aes() names a column (Predicted_price) that predicted_price
# does not have; it is kept unchanged because every layer added in this block
# supplies its own x and y.
a = ggplot(NULL, aes(Predicted_price, Price)) +
  geom_smooth(data = predicted_price, aes(x = Predicted, y = Predicted),
              col = 'blue', alpha = .8) +
  geom_smooth(data = predicted_price, aes(x = Predicted, y = Predict_lwr),
              col = 'red', linetype = 'dashed', alpha = .8) +
  geom_smooth(data = predicted_price, aes(x = Predicted, y = Predict_upr),
              col = 'red', linetype = 'dashed', alpha = .8) +
  geom_smooth(data = predicted_price, aes(x = Predicted, y = Confidence_lwr),
              col = 'black', linetype = 'dashed', alpha = .8) +
  geom_smooth(data = predicted_price, aes(x = Predicted, y = Confidence_upr),
              col = 'black', linetype = 'dashed', alpha = .8)

# Actual against predicted price for every model.
# - The `text` aesthetic was removed: geom_point() does not use it, and it was
#   the source of the "Ignoring unknown aesthetics: text" warning.
# - coord_cartesian() sets the visible window without discarding rows, whereas
#   xlim()/ylim() drop out-of-range values before the smooths are fitted.
a + geom_point(data = predicted_price, aes(x = Predicted, y = Price)) +
  coord_cartesian(xlim = c(25, 250), ylim = c(25, 250)) +
  xlab("Predicted price (euros in thousands)") + ylab("Price (euros in thousands)") +
  ggtitle('Predicted Price vs. Price for all EV Models')
Warning: Ignoring unknown aesthetics: text
NA
# Plot `b`: the base curves plus one point per model, coloured by Make2.
# predicted_price has no Make2 column, so the colour previously resolved only
# if a stray global `Make2` existed. predicted_price is built row-for-row from
# ecars, so the column is attached from ecars$Make2 for this layer.
b = a +
  geom_point(
    data = cbind(predicted_price, Make2 = ecars$Make2),
    aes(x = Predicted, y = Price, col = Make2)
  ) +
  theme(legend.position = "bottom", legend.text = element_text(size = 8)) +
  xlab("Predicted price (euros in thousands)") + ylab("Price (euros in thousands)") +
  ggtitle('Predicted Price vs. Price for all EV Models') +
  scale_color_manual(name = "Make", values = make_colors2, labels = make_other)

# Two zoom levels. coord_cartesian() only changes the visible window;
# xlim()/ylim() would drop the out-of-range rows and refit every smooth on the
# remaining subset, so the curves would differ between the two views.
b + coord_cartesian(xlim = c(25, 180), ylim = c(25, 200))
b + coord_cartesian(xlim = c(25, 75), ylim = c(25, 75))
# Plot `c`: the base curves, every model as a translucent point, and the
# per-make means from most_makes drawn as a size-3 point with a smaller
# coloured point on top.
# NOTE(review): the name `c` shadows base::c as a variable; calls to c() still
# resolve to the function, but a more descriptive name would be safer.
c = a +
  geom_point(data = predicted_price, aes(x = Predicted, y = Price), alpha = .5) +
  geom_point(data = most_makes, aes(x = mean_predicted, y = mean_price), size = 3) +
  geom_point(data = most_makes, aes(x = mean_predicted, y = mean_price, col = Make), size = 2) +
  theme(legend.position = "bottom", legend.text = element_text(size = 8)) +
  xlab("Predicted price (euros in thousands)") + ylab("Price (euros in thousands)") +
  ggtitle('Predicted Price vs. Price for all EV Models') +
  scale_color_manual(values = make_colors)

# Full view and zoomed view. coord_cartesian() keeps all rows in the smooth
# fits; xlim()/ylim() would discard out-of-range rows before fitting.
c + coord_cartesian(xlim = c(25, 250), ylim = c(25, 250))
c + coord_cartesian(xlim = c(25, 75), ylim = c(25, 75))
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.